Rem $Header: rdbms/demo/dmocdemo.sql /main/13 2012/05/14 06:52:30 jiawang Exp $
Rem
Rem dmocdemo.sql
Rem
Rem Copyright (c) 2004, 2012, Oracle and/or its affiliates. 
Rem All rights reserved. 
Rem
Rem    NAME
Rem      dmocdemo.sql - Sample program for the DBMS_DATA_MINING package
Rem
Rem    DESCRIPTION
Rem      This script creates a clustering model
Rem      using the O-Cluster algorithm
Rem      and data in the SH (Sales History) schema in the RDBMS.
Rem
Rem    NOTES
Rem     
Rem
Rem    MODIFIED   (MM/DD/YY) 
Rem    amozes      01/26/12 - updates for 12c
Rem    amozes      01/23/12 - use ADP
Rem    xbarr       01/10/12 - add cluster_details
Rem    ramkrish    06/14/07 - remove commit after settings
Rem    ramkrish    10/25/07 - replace deprecated get_model calls with catalog
Rem                           queries
Rem    ktaylor     07/11/05 - minor edits to comments
Rem    jcjeon      01/18/05 - add column format 
Rem    mmcampos    11/10/04 - edit comments
Rem    ramkrish    10/26/04 - add data analysis and comments/cleanup 
Rem    gtang       08/25/04 - Fix bug #3808314 
Rem    gtang       08/18/04 - gtang_bug-3785950
Rem    gtang       08/09/04 - gtang_bug-3785785
Rem    gtang       07/23/04 - gtang_bug-3753633
Rem    gtang       07/15/04 - Renamed and moved to rdbms/demo
Rem    gtang       06/15/04 - add apply
Rem    gtang       05/20/04 - add get_model_details_oc
Rem    gtang       05/06/04 - gtang_txn110497
Rem    gtang       04/27/04 - Creation
Rem

-- SQL*Plus session settings used for the whole demo run
SET SERVEROUTPUT ON
SET TRIMSPOOL ON
SET PAGESIZE 10000
SET LINESIZE 120
SET ECHO ON

-----------------------------------------------------------------------
--                            SAMPLE PROBLEM
-----------------------------------------------------------------------
-- Segment the demographic data into 10 clusters and study the individual
-- clusters. Rank the clusters on probability.

-----------------------------------------------------------------------
--                            SET UP AND ANALYZE THE DATA
-----------------------------------------------------------------------

-- The data for this sample is composed from base tables in SH Schema
-- (See Sample Schema Documentation) and presented through these views:
-- mining_data_build_v (build data)
-- mining_data_test_v  (test data)
-- mining_data_apply_v (apply data)
-- (See dmsh.sql for view definitions).
--

-----------
-- ANALYSIS
-----------
-- For clustering using OC, perform the following on mining data.
--
-- 1. Use Automatic Data Preparation (ADP)
--    O-Cluster uses a special binning procedure that automatically 
--    determines the number of bins based on data statistics.
--

-----------------------------------------------------------------------
--                            BUILD THE MODEL
-----------------------------------------------------------------------

-- Cleanup old model with the same name for repeat runs.
-- The drop is attempted only when the model is listed in
-- user_mining_models, so a first run stays silent while a genuine
-- DROP_MODEL failure is reported instead of being swallowed.
DECLARE
  v_model_count PLS_INTEGER;
BEGIN
  SELECT COUNT(*)
    INTO v_model_count
    FROM user_mining_models
   WHERE model_name = 'OC_SH_CLUS_SAMPLE';

  IF v_model_count > 0 THEN
    DBMS_DATA_MINING.DROP_MODEL('OC_SH_Clus_sample');
  END IF;
END;
/

-------------------
-- SPECIFY SETTINGS
--
-- Cleanup old settings table for repeat runs
--
-- Drop the settings table left over from a previous run.
-- Only "table or view does not exist" (ORA-00942) is tolerated, which is
-- the expected outcome on a first run. Any other error is raised.
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE oc_sh_sample_settings';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;  -- nothing to clean up
END;
/

-- K-Means is the default clustering algorithm. Override the
-- default to set the algorithm to O-Cluster using a settings table.
-- 
-- CREATE AND POPULATE A SETTINGS TABLE
--
-- Echo is switched off while the settings table is created and
-- switched back on afterwards.
SET ECHO OFF
CREATE TABLE oc_sh_sample_settings
(
  setting_name   VARCHAR2(30),
  setting_value  VARCHAR2(4000)
);
SET ECHO ON
-- Populate the settings table read by CREATE_MODEL.
-- Target columns are named explicitly, and numeric values are converted
-- with TO_CHAR because setting_value is a VARCHAR2 column.
BEGIN
  -- Override the default algorithm with O-Cluster
  INSERT INTO oc_sh_sample_settings (setting_name, setting_value)
  VALUES (dbms_data_mining.algo_name, dbms_data_mining.algo_ocluster);

  -- Number of clusters requested
  INSERT INTO oc_sh_sample_settings (setting_name, setting_value)
  VALUES (dbms_data_mining.clus_num_clusters, TO_CHAR(10));

  -- O-Cluster buffer setting
  INSERT INTO oc_sh_sample_settings (setting_name, setting_value)
  VALUES (dbms_data_mining.oclt_max_buffer, TO_CHAR(30000));

  -- Turn automatic data preparation on
  INSERT INTO oc_sh_sample_settings (setting_name, setting_value)
  VALUES (dbms_data_mining.prep_auto, dbms_data_mining.prep_auto_on);

  -- Other possible settings are:
  -- (dbms_data_mining.oclt_sensitivity, 0.5);
END;
/

---------------------
-- CREATE A NEW MODEL
--
-- Build a new OC model
-- TO_CHAR function is used to transform columns to 
-- categorical attributes since numeric datatypes 
-- are treated as numeric attributes.
-- Build the O-Cluster model from mining_data_build_v.
-- Each column named in categorical_cols gets an embedded TO_CHAR
-- transformation so that it is handled as a categorical attribute
-- and not as a numeric one.
DECLARE
  TYPE column_name_list IS TABLE OF VARCHAR2(128);

  categorical_cols CONSTANT column_name_list := column_name_list(
    'AFFINITY_CARD',
    'BOOKKEEPING_APPLICATION',
    'BULK_PACK_DISKETTES',
    'FLAT_PANEL_MONITOR',
    'HOME_THEATER_PACKAGE',
    'OS_DOC_SET_KANJI',
    'PRINTER_SUPPLIES',
    'Y_BOX_GAMES');

  xforms dbms_data_mining_transform.TRANSFORM_LIST;
BEGIN
  -- Register one TO_CHAR(<column>) expression per column, in list order
  FOR idx IN 1 .. categorical_cols.COUNT LOOP
    dbms_data_mining_transform.SET_TRANSFORM(
      xforms,
      categorical_cols(idx),
      NULL,
      'TO_CHAR(' || categorical_cols(idx) || ')',
      NULL);
  END LOOP;

  DBMS_DATA_MINING.CREATE_MODEL(
    model_name          => 'OC_SH_Clus_sample',
    mining_function     => dbms_data_mining.clustering,
    data_table_name     => 'mining_data_build_v',
    case_id_column_name => 'cust_id',
    settings_table_name => 'oc_sh_sample_settings',
    xform_list          => xforms);
END;
/

-------------------------
-- DISPLAY MODEL SETTINGS
--
-- List the settings recorded for the model, sorted by setting name
COLUMN setting_name  FORMAT a30
COLUMN setting_value FORMAT a30
SELECT s.setting_name,
       s.setting_value
  FROM user_mining_model_settings s
 WHERE s.model_name = 'OC_SH_CLUS_SAMPLE'
 ORDER BY s.setting_name;

--------------------------
-- DISPLAY MODEL SIGNATURE
--
-- List the attributes in the model signature, sorted by attribute name
COLUMN attribute_name FORMAT a40
COLUMN attribute_type FORMAT a20
SELECT a.attribute_name,
       a.attribute_type
  FROM user_mining_model_attributes a
 WHERE a.model_name = 'OC_SH_CLUS_SAMPLE'
 ORDER BY a.attribute_name;

-------------------------
-- DISPLAY MODEL METADATA
--
-- Show the catalog entry for the model: function, algorithm and size
COLUMN mining_function FORMAT a20
COLUMN algorithm       FORMAT a20
COLUMN model_size      FORMAT 99.99
SELECT m.mining_function,
       m.algorithm,
       m.model_size
  FROM user_mining_models m
 WHERE m.model_name = 'OC_SH_CLUS_SAMPLE';

------------------------
-- DISPLAY MODEL DETAILS
--
-- Cluster details are best viewed in pieces, chosen according to the
-- associations and groupings you want to observe.
--

-- CLUSTERS
-- One row per cluster: its id, the number of records in it, the id of
-- its parent cluster, and its level in the hierarchy.
-- NOTE: Unlike K-means, O-Cluster does not return a value for the
--       dispersion associated with a cluster.
--
SELECT d.id           AS clu_id,
       d.record_count AS rec_cnt,
       d.parent,
       d.tree_level,
       d.dispersion
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', NULL, NULL, 0, 0, 0)) d
 ORDER BY d.id;

-- TAXONOMY
-- Pairs each cluster id with the ids of its child clusters by unnesting
-- the child collection returned for every cluster.
--
SELECT parent_clu.id AS clu_id,
       child_clu.id  AS child_id
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', NULL, NULL, 0, 0, 0)) parent_clu,
       TABLE(parent_clu.child) child_clu
 ORDER BY parent_clu.id, child_clu.id;

-- SPLIT PREDICATES
-- For each cluster, the split predicate gives the attribute and the
-- condition that were used during model build to assign records to the
-- cluster's children. It shows how the population within a cluster can
-- be divided into two smaller clusters.
--
COLUMN attribute_name FORMAT a20
COLUMN op             FORMAT a2
COLUMN s_value        FORMAT a50
SELECT clu.id                    AS clu_id,
       pred.attribute_name,
       pred.conditional_operator AS op,
       pred.attribute_str_value  AS s_value
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', NULL, NULL, 0, 0, 0)) clu,
       TABLE(clu.split_predicate) pred
 ORDER BY clu.id, op, s_value;

-- CENTROIDS FOR LEAF CLUSTERS
-- For cluster_id 1, list every attribute that makes up the centroid,
-- with its mean (for numericals) or mode (for categoricals).
-- Unlike K-Means, O-Cluster does not return the variance for numeric
-- attributes.
--
COLUMN mode_value FORMAT a60
SELECT clu.id AS clu_id,
       cen.attribute_name,
       cen.mean,
       cen.mode_value
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', 1, NULL, 1, 0, 0)) clu,
       TABLE(clu.centroid) cen
 ORDER BY cen.attribute_name;

-- HISTOGRAM FOR ATTRIBUTE OF A LEAF CLUSTER
-- For cluster 1, provide the histogram for the AGE attribute.
-- Histogram count is represented in frequency, rather than actual count.
--
-- The numeric format is attached to cnt, the alias under which h.count is
-- projected. A format declared for "count" would never match the output
-- column. The cnt format is cleared after the query so that it does not
-- affect later queries that also project a column named cnt.
COLUMN cnt    FORMAT 9999.99
COLUMN bin_id FORMAT 9999999
COLUMN clu_id FORMAT 99999999
COLUMN label  FORMAT a20
SELECT a.id    AS clu_id,
       h.bin_id,
       h.attribute_name,
       h.label,
       h.count AS cnt
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', 1, 'AGE', 0, 1, 0)) a,
       TABLE(a.histogram) h
 ORDER BY a.id, h.attribute_name, h.bin_id;
COLUMN cnt CLEAR

-- RULES FOR LEAF CLUSTERS
-- One row per rule with its support and confidence.
-- See dmkmdemo.sql for explanation on output columns.
COLUMN confidence FORMAT 999999.99
SELECT r.id                   AS rule_id,
       r.rule.rule_support    AS support,
       r.rule.rule_confidence AS confidence
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', NULL, NULL, 0, 0, 1)) r
 ORDER BY r.id;

-- RULE DETAILS FOR LEAF CLUSTERS
-- Lists the antecedent predicates of the rules whose id is below 3.
-- See dmkmdemo.sql for explanation on output columns.
--
-- val shows the string value of a predicate when there is one, otherwise
-- the numeric value rounded to 4 decimals. The number is converted with an
-- explicit TO_CHAR so that both NVL arguments are character data.
-- Rows are ordered by the named output columns, not by select-list position.
COLUMN aname      FORMAT a25
COLUMN op         FORMAT a3
COLUMN val        FORMAT a60
COLUMN support    FORMAT 9999
COLUMN confidence FORMAT 9.9999
SELECT T.id                   AS rule_id,
       A.attribute_name       AS aname,
       A.conditional_operator AS op,
       NVL(A.attribute_str_value,
           TO_CHAR(ROUND(A.attribute_num_value, 4))) AS val,
       A.attribute_support    AS support,
       A.attribute_confidence AS confidence
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_OC(
               'OC_SH_Clus_sample', NULL, NULL, 0, 0, 2)) T,
       TABLE(T.rule.antecedent) A
 WHERE T.id < 3
 ORDER BY rule_id, aname, op, val, support, confidence;

-----------------------------------------------------------------------
--                               TEST THE MODEL
-----------------------------------------------------------------------

-- There is no specific set of testing parameters for Clustering.
-- Examination and analysis of clusters is the main method to prove
-- the efficacy of a clustering model.
--

-----------------------------------------------------------------------
--                               APPLY THE MODEL
-----------------------------------------------------------------------
-- For a descriptive mining function like Clustering, "Scoring" involves
-- assigning the probability with which a given case belongs to a given
-- cluster.

-------------------------------------------------
-- SCORE NEW DATA USING SQL DATA MINING FUNCTIONS
--
------------------
-- BUSINESS CASE 1
-- List the clusters into which the customers in this
-- given dataset have been grouped.
--
-- Count the apply-set customers assigned to each cluster, largest cluster
-- first. The cluster id is a secondary sort key so that clusters with equal
-- counts come out in the same order on every run.
SELECT CLUSTER_ID(oc_sh_clus_sample USING *) AS clus,
       COUNT(*)                              AS cnt
  FROM mining_data_apply_v
 GROUP BY CLUSTER_ID(oc_sh_clus_sample USING *)
 ORDER BY cnt DESC, clus;

-- See dmkmdemo.sql for more examples

------------------
-- BUSINESS CASE 2
-- Assign 5 customers to clusters, and provide explanations for the
-- assignments.
--
SET LONG 20000
SET LINESIZE 200
SET PAGESIZE 100
COLUMN cust_id FORMAT 999999999
SELECT cust_id,
       CLUSTER_DETAILS(oc_sh_clus_sample USING *) AS cluster_details
  FROM mining_data_apply_v
 WHERE cust_id <= 100005
 ORDER BY cust_id;
